#include "stdafx.h"

#include <stdio.h>
#include <stdlib.h>
#include "hnum_pcsp_defs.h"
namespace harlinn
{
    namespace numerics
    {
        namespace SuperLU
        {
            namespace Complex
            {


#define PRINT_SPIN_TIME(where)  { \
  if ( t2 > 0.001 ) { \
      printf("[%d] Panel%6d on P%2d waits s-node%6d for %8.2f msec.\n",\
	     where, jcol, pnum, kcol, t2*1e3);\
      fflush(stdout); } }


                #ifdef PREDICT_OPT

                /* comparison function used by qsort() - in decreasing order */
                int numcomp(desc_eft_t *a, desc_eft_t *b)
                {
                    if ( a->eft < b->eft )
	                return -1;
                    else if ( a->eft > b->eft )
	                return 1;
                    else
	                return 0;
                }
                #endif

                void
                pcgstrf_panel_bmod(
		                   const int  pnum, /* process number */
		                   const int  m,    /* number of rows in the matrix */
		                   const int  w,    /* current panel width */
		                   const int  jcol, /* leading column of the current panel */
		                   const int  bcol, /* first column of the farthest busy snode*/
		                   int   *inv_perm_r,/* in; inverse of the row pivoting */
		                   int   *etree,     /* in */
		                   int   *nseg,      /* modified */
		                   int   *segrep,    /* modified */
		                   int   *repfnz,    /* modified, size n-by-w */
		                   int   *panel_lsub,/* modified */
		                   int   *w_lsub_end,/* modified */
		                   int   *spa_marker,/* modified; size n-by-w */
		                   complex *dense, /* modified, size n-by-w */
		                   complex *tempv, /* working array - zeros on input/output */
		                   pxgstrf_shared_t *pxgstrf_shared /* modified */
		                   )
                {
                /*
                 * -- SuperLU MT routine (version 2.0) --
                 * Lawrence Berkeley National Lab, Univ. of California Berkeley,
                 * and Xerox Palo Alto Research Center.
                 * September 10, 2007
                 *
                 * Purpose
                 * =======
                 *
                 *    Performs numeric block updates (sup-panel) in topological order.
                 *    It features combined 1D and 2D blocking of the source updating s-node.
                 *    It consists of two steps:
                 *       (1) accumulates updates from "done" s-nodes.
                 *       (2) accumulates updates from "busy" s-nodes.
                 *
                 *    Before entering this routine, the nonzeros of the original A in
                 *    this panel were already copied into the SPA dense[n,w].
                 *
                 * Updated/Output arguments
                 * ========================
                 *    L[*,j:j+w-1] and U[*,j:j+w-1] are returned collectively in the
                 *    m-by-w vector dense[*,w]. The locations of nonzeros in L[*,j:j+w-1]
                 *    are given by lsub[*] and U[*,j:j+w-1] by (nseg,segrep,repfnz).
                 *
                 */
                    GlobalLU_t *Glu = pxgstrf_shared->Glu;  /* modified */
                    Gstat_t *Gstat = pxgstrf_shared->Gstat; /* modified */
                    register int j, k, ksub;
                    register int fsupc, nsupc, nsupr, nrow;
                    register int kcol, krep, ksupno, dadsupno;
                    register int jj;	      /* index through each column in the panel */
                    int          *xsup, *xsup_end, *supno;
                    int          *lsub, *xlsub, *xlsub_end;
                    int          *repfnz_col; /* repfnz[] for a column in the panel */
                    complex       *dense_col;  /* dense[] for a column in the panel */
                    int          *col_marker; /* each column of the spa_marker[*,w] */
                    int          *col_lsub;   /* each column of the panel_lsub[*,w] */
                    static   int first = 1, rowblk, colblk;

                #ifdef PROFILE
                    double   t1, t2; /* temporary time */
                #endif
    
                #ifdef PREDICT_OPT    
                    register float pmod, max_child_eft = 0, sum_pmod = 0, min_desc_eft = 0;
                    register float pmod_eft;
                    register int   kid, ndesc = 0;
                #endif
    
                #if ( DEBUGlevel>=2 )
                    int dbg_addr = 0*m;
                #endif
    
                    if ( first ) {
	                rowblk   = sp_ienv(4);
	                colblk   = sp_ienv(5);
	                first = 0;
                    }
    
                    xsup      = Glu->xsup;
                    xsup_end  = Glu->xsup_end;
                    supno     = Glu->supno;
                    lsub      = Glu->lsub;
                    xlsub     = Glu->xlsub;
                    xlsub_end = Glu->xlsub_end;

                #if ( DEBUGlevel>=2 )
                    /*if (jcol >= LOCOL && jcol <= HICOL)
                    check_panel_dfs_list(pnum, "begin", jcol, *nseg, segrep);*/
                if (jcol == BADPAN)
                    printf("(%d) Enter pcgstrf_panel_bmod() jcol %d,BADCOL %d,dense_col[%d] %.10f\n",
	                   pnum, jcol, BADCOL, BADROW, dense[dbg_addr+BADROW]);
                #endif    

                    /* --------------------------------------------------------------------
                       For each non-busy supernode segment of U[*,jcol] in topological order,
                       perform sup-panel update.
                       -------------------------------------------------------------------- */
                    k = *nseg - 1;
                    for (ksub = 0; ksub < *nseg; ++ksub) {
	                /*
	                 * krep = representative of current k-th supernode
	                 * fsupc = first supernodal column
	                 * nsupc = no of columns in a supernode
	                 * nsupr = no of rows in a supernode
	                 */
                        krep = segrep[k--];
	                fsupc = xsup[supno[krep]];
	                nsupc = krep - fsupc + 1;
	                nsupr = xlsub_end[fsupc] - xlsub[fsupc];
	                nrow = nsupr - nsupc;

                #ifdef PREDICT_OPT
	                pmod = Gstat->procstat[pnum].fcops;
                #endif
	    
	                if ( nsupc >= colblk && nrow >= rowblk ) {
	                    /* 2-D block update */
                #ifdef GEMV2
	                    pcgstrf_bmod2D_mv2(pnum, m, w, jcol, fsupc, krep, nsupc, nsupr, 
			                       nrow, repfnz, panel_lsub, w_lsub_end, 
			                       spa_marker, dense, tempv, Glu, Gstat);
                #else
	                    pcgstrf_bmod2D(pnum, m, w, jcol, fsupc, krep, nsupc, nsupr, nrow,
			                   repfnz, panel_lsub, w_lsub_end, spa_marker,
			                   dense, tempv, Glu, Gstat);
                #endif
	                } else {
	                    /* 1-D block update */
                #ifdef GEMV2
	                    pcgstrf_bmod1D_mv2(pnum, m, w, jcol, fsupc, krep, nsupc, nsupr,
			                       nrow, repfnz, panel_lsub, w_lsub_end, 
			                       spa_marker, dense, tempv, Glu, Gstat);
                #else
	                    pcgstrf_bmod1D(pnum, m, w, jcol, fsupc, krep, nsupc, nsupr, nrow,
			                   repfnz, panel_lsub, w_lsub_end, spa_marker,
			                   dense, tempv, Glu, Gstat);
                #endif
	                }
	
                #ifdef PREDICT_OPT
	                pmod = Gstat->procstat[pnum].fcops - pmod;
	                kid = (Glu->pan_status[krep].size > 0) ?
	                    krep : (krep + Glu->pan_status[krep].size);
	                desc_eft[ndesc].eft = cp_panel[kid].est + cp_panel[kid].pdiv;
	                desc_eft[ndesc++].pmod = pmod;
                #endif
	
                #if ( DEBUGlevel>=2 )
                if (jcol == BADPAN)
                    printf("(%d) non-busy update: krep %d, repfnz %d, dense_col[%d] %.10e\n",
	                   pnum, krep, repfnz[dbg_addr+krep], BADROW, dense[dbg_addr+BADROW]);
                #endif

                    } /* for each updating supernode ... */
    
                #if ( DEBUGlevel>=2 )
                if (jcol == BADPAN)
                    printf("(%d) After non-busy update: dense_col[%d] %.10e\n",
	                   pnum, BADROW, dense[dbg_addr+BADROW]);
                #endif
    
                    /* ---------------------------------------------------------------------
                     * Now wait for the "busy" s-nodes to become "done" -- this amounts to
                     * climbing up the e-tree along the path starting from "bcol".
                     * Several points are worth noting:
                     *
                     *  (1) There are two possible relations between supernodes and panels
                     *      along the path of the e-tree:
                     *      o |s-node| < |panel|
                     *        want to climb up the e-tree one column at a time in order
                     *        to achieve more concurrency
                     *      o |s-node| > |panel|
                     *        want to climb up the e-tree one panel at a time; this
                     *        processor is stalled anyway while waiting for the panel.
                     *
                     *  (2) Need to accommodate new fills, append them in panel_lsub[*,w].
                     *      o use an n-by-w marker array, as part of the SPA (not scalable!)
                     *
                     *  (3) Symbolically, need to find out repfnz[S, w], for each (busy)
                     *      supernode S.
                     *      o use dense[inv_perm_r[kcol]], filter all zeros
                     *      o detect the first nonzero in each segment
                     *        (at this moment, the boundary of the busy supernode/segment
                     *         S has already been identified)
                     *
                     * --------------------------------------------------------------------- */

                    kcol = bcol;
                    while ( kcol < jcol ) {
                        /* Pointers to each column of the w-wide arrays. */
	                repfnz_col = repfnz;
	                dense_col = dense;
	                col_marker = spa_marker;
	                col_lsub = panel_lsub;

	                /* Wait for the supernode, and collect wait-time statistics. */
	                if ( pxgstrf_shared->spin_locks[kcol] ) {
                #ifdef PROFILE
	                    TIC(t1);
                #endif
	                    await( &pxgstrf_shared->spin_locks[kcol] );

                #ifdef PROFILE
	                    TOC(t2, t1);
	                    Gstat->panstat[jcol].pipewaits++;
	                    Gstat->panstat[jcol].spintime += t2;
	                    Gstat->procstat[pnum].spintime += t2;
                #ifdef DOPRINT
	                    PRINT_SPIN_TIME(1);
                #endif
                #endif		
	                }
	
                        /* Find leading column "fsupc" in the supernode that
                           contains column "kcol" */
	                ksupno = supno[kcol];
	                fsupc = kcol;

                #if ( DEBUGlevel>=2 )
	                /*if (jcol >= LOCOL && jcol <= HICOL)    */
                  if ( jcol==BADCOL )
                    printf("(%d) pcgstrf_panel_bmod[1] kcol %d, ksupno %d, fsupc %d\n",
	                   pnum, kcol, ksupno, fsupc);
                #endif
	
	                /* Wait for the whole supernode to become "done" --
	                   climb up e-tree one column at a time */
	                do {
	                    krep = SUPER_REP( ksupno );
	                    kcol = etree[kcol];
	                    if ( kcol >= jcol ) break;
	                    if ( pxgstrf_shared->spin_locks[kcol] ) {
                #ifdef PROFILE
		                TIC(t1);
                #endif
		                await ( &pxgstrf_shared->spin_locks[kcol] );

                #ifdef PROFILE
		                TOC(t2, t1);
		                Gstat->panstat[jcol].pipewaits++;
		                Gstat->panstat[jcol].spintime += t2;
		                Gstat->procstat[pnum].spintime += t2;
                #ifdef DOPRINT
		                PRINT_SPIN_TIME(2);
                #endif
                #endif		
	                    }

	                    dadsupno = supno[kcol];

                #if ( DEBUGlevel>=2 )
	                    /*if (jcol >= LOCOL && jcol <= HICOL)*/
                if ( jcol==BADCOL )
                    printf("(%d) pcgstrf_panel_bmod[2] krep %d, dad=kcol %d, dadsupno %d\n",
	                   pnum, krep, kcol, dadsupno);
                #endif	    

	                } while ( dadsupno == ksupno );

	                /* Append the new segment into segrep[*]. After column_bmod(),
	                   copy_to_ucol() will use them. */
	                segrep[*nseg] = krep;
                        ++(*nseg);
        
	                /* Determine repfnz[krep, w] for each column in the panel */
	                for (jj = jcol; jj < jcol + w; ++jj, dense_col += m, 
	                       repfnz_col += m, col_marker += m, col_lsub += m) {
	                    /*
	                     * Note: relaxed supernode may not form a path on the e-tree,
	                     *       but its column numbers are contiguous.
	                     */
                #ifdef SCATTER_FOUND
 	                    for (kcol = fsupc; kcol <= krep; ++kcol) {
		                if ( col_marker[inv_perm_r[kcol]] == jj ) {
		                    repfnz_col[krep] = kcol;

 		                    /* Append new fills in panel_lsub[*,jj]. */
		                    j = w_lsub_end[jj - jcol];
                /*#pragma ivdep*/
		                    for (k = xlsub[krep]; k < xlsub_end[krep]; ++k) {
			                ksub = lsub[k];
			                if ( col_marker[ksub] != jj ) {
			                    col_marker[ksub] = jj;
			                    col_lsub[j++] = ksub;
			                }
		                    }
		                    w_lsub_end[jj - jcol] = j;

		                    break; /* found the leading nonzero in the segment */
		                }
	                    }

                #else
	                    for (kcol = fsupc; kcol <= krep; ++kcol) {
		                if ( (dense_col[inv_perm_r[kcol]].r != 0.0) || (dense_col[inv_perm_r[kcol]].i != 0.0) ) {
		                    repfnz_col[krep] = kcol;
		                    break; /* Found the leading nonzero in the U-segment */
		                }
	                    }

	                    /* In this case, we always treat the L-subscripts of the 
	                       busy s-node [kcol : krep] as the new fills, even if the
	                       corresponding U-segment may be all zero. */

	                    /* Append new fills in panel_lsub[*,jj]. */
	                    j = w_lsub_end[jj - jcol];
                /*#pragma ivdep*/
	                    for (k = xlsub[krep]; k < xlsub_end[krep]; ++k) {
	                        ksub = lsub[k];
		                if ( col_marker[ksub] != jj ) {
		                    col_marker[ksub] = jj;
		                    col_lsub[j++] = ksub;
		                }
	                    }
	                    w_lsub_end[jj - jcol] = j;
                #endif

                #if ( DEBUGlevel>=2 )
                if (jj == BADCOL) {
                printf("(%d) pcgstrf_panel_bmod[fills]: jj %d, repfnz_col[%d] %d, inv_pr[%d] %d\n",
	                   pnum, jj, krep, repfnz_col[krep], fsupc, inv_perm_r[fsupc]);
                printf("(%d) pcgstrf_panel_bmod[fills] xlsub %d, xlsub_end %d, #lsub[%d] %d\n",
                       pnum,xlsub[krep],xlsub_end[krep],krep, xlsub_end[krep]-xlsub[krep]);
                }
                #endif	   
	                } /* for jj ... */

                #ifdef PREDICT_OPT
	                pmod = Gstat->procstat[pnum].fcops;
                #endif
	
	                /* Perform sup-panel updates - use combined 1D + 2D updates. */
	                nsupc = krep - fsupc + 1;
	                nsupr = xlsub_end[fsupc] - xlsub[fsupc];
	                nrow = nsupr - nsupc;
	                if ( nsupc >= colblk && nrow >= rowblk ) {
	                    /* 2-D block update */
                #ifdef GEMV2
	                    pcgstrf_bmod2D_mv2(pnum, m, w, jcol, fsupc, krep, nsupc, nsupr,
			                       nrow, repfnz, panel_lsub, w_lsub_end, 
			                       spa_marker, dense, tempv, Glu, Gstat);
                #else
	                    pcgstrf_bmod2D(pnum, m, w, jcol, fsupc, krep, nsupc, nsupr, nrow,
			                   repfnz, panel_lsub, w_lsub_end, spa_marker,
			                   dense, tempv, Glu, Gstat);
                #endif
	                } else {
	                    /* 1-D block update */
                #ifdef GEMV2
	                    pcgstrf_bmod1D_mv2(pnum, m, w, jcol, fsupc, krep, nsupc, nsupr,
			                       nrow, repfnz, panel_lsub, w_lsub_end, 
			                       spa_marker, dense, tempv, Glu, Gstat);
                #else
	                    pcgstrf_bmod1D(pnum, m, w, jcol, fsupc, krep, nsupc, nsupr, nrow,
			                   repfnz, panel_lsub, w_lsub_end, spa_marker,
			                   dense, tempv, Glu, Gstat);
                #endif
	                }

                #ifdef PREDICT_OPT
	                pmod = Gstat->procstat[pnum].fcops - pmod;
	                kid = (pxgstrf_shared->pan_status[krep].size > 0) ?
	                       krep : (krep + pxgstrf_shared->pan_status[krep].size);
	                desc_eft[ndesc].eft = cp_panel[kid].est + cp_panel[kid].pdiv;
	                desc_eft[ndesc++].pmod = pmod;
                #endif
	
                #if ( DEBUGlevel>=2 )
                if (jcol == BADPAN)
                    printf("(%d) After busy update: dense_col[%d] %.10f\n",
	                   pnum, BADROW, dense[dbg_addr+BADROW]);
                #endif
	
	                /* Go to the parent of "krep" */
	                kcol = etree[krep];
	
                    } /* while kcol < jcol ... */
    
                #if ( DEBUGlevel>=2 )
                    /*if (jcol >= LOCOL && jcol <= HICOL)*/
                if ( jcol==BADCOL )
                    check_panel_dfs_list(pnum, "after-busy", jcol, *nseg, segrep);
                #endif

                #ifdef PREDICT_OPT
                    qsort(desc_eft, ndesc, sizeof(desc_eft_t), (int(*)())numcomp);
                    pmod_eft = 0;
                    for (j = 0; j < ndesc; ++j) {
	                pmod_eft = SUPERLU_MAX( pmod_eft, desc_eft[j].eft ) + desc_eft[j].pmod;
                    }

                    if ( ndesc == 0 ) {
	                /* No modifications from descendants */
	                pmod_eft = 0;
	                for (j = cp_firstkid[jcol]; j != EMPTY; j = cp_nextkid[j]) {
	                    kid = (pxgstrf_shared->pan_status[j].size > 0) ? 
			                j : (j + pxgstrf_shared->pan_status[j].size);
	                    pmod_eft = SUPERLU_MAX( pmod_eft,
			   	                cp_panel[kid].est + cp_panel[kid].pdiv );
	                }
                    }
    
                    cp_panel[jcol].est = pmod_eft;
    
                #endif

                }

                static int
                check_panel_dfs_list(int pnum, char *msg, int jcol, int nseg, int *segrep)
                {
                    register int k;

                    printf("(%d) pcgstrf_panel_bmod(%s) jcol %d, nseg %d in top. order ->\n",
	                   pnum, msg, jcol, nseg);
                    for (k = nseg-1; k >= 0; --k)
	                printf("(%d) segrep-%d ", pnum, segrep[k]);
                    printf("\n");
                    fflush(stdout);
                    return 0;
                }
            };
        };
    };
};